# ---- Load input data ----
# The stray first line "Code" was a label left over from the rendered
# notebook, not R code, and is dropped here.
#
# Load every .RData file found in ../data/ into the current environment.
# The original had this loop twice, fused onto the same physical lines as the
# following statements, which does not parse. Loading once is sufficient.
# The pattern is anchored so only files ending in ".RData" are picked up.
files <- list.files("../data", pattern = "\\.RData$", full.names = TRUE)
for (file in files) {
  load(file)
}

# ---- Database schema ----
# Lookup table of document types.
dbExecute(db,
"CREATE TABLE IF NOT EXISTS doc_types (
id SMALLINT,
doc_type TEXT,
PRIMARY KEY (id)
);")
# Remaining schema: every statement is CREATE TABLE IF NOT EXISTS, so running
# this on an already-initialised database leaves existing tables untouched.
# The statements are executed in the order listed; tables that reference
# `works` or a lookup table come after the table they reference.
schema_ddl <- c(
  # Publications (one row per work).
  works = "CREATE TABLE IF NOT EXISTS works (
id TEXT,
pub_year SMALLINT,
title TEXT,
abstract TEXT,
doi TEXT,
journal TEXT,
PRIMARY KEY (id)
);",
  # Link table: works <-> doc_types.
  works_doc_types = "CREATE TABLE IF NOT EXISTS works_doc_types (
work_id TEXT,
doc_type_id SMALLINT,
PRIMARY KEY (work_id, doc_type_id),
FOREIGN KEY (work_id) REFERENCES works (id),
FOREIGN KEY (doc_type_id) REFERENCES doc_types (id)
);",
  # Lookup table of citation indexes.
  citation_indexes = "CREATE TABLE IF NOT EXISTS citation_indexes (
id SMALLINT,
ci TEXT,
PRIMARY KEY (id));",
  # Link table: works <-> citation_indexes.
  works_index = "CREATE TABLE IF NOT EXISTS works_index (
work_id TEXT,
citation_index_id SMALLINT,
PRIMARY KEY (work_id, citation_index_id),
FOREIGN KEY (work_id) REFERENCES works (id),
FOREIGN KEY (citation_index_id) REFERENCES citation_indexes (id)
);",
  # Lookup table of classifications.
  classification = "CREATE TABLE IF NOT EXISTS classification (
id SMALLINT,
classification TEXT,
PRIMARY KEY (id)
);",
  # Link table: works <-> classification.
  works_classification = "CREATE TABLE IF NOT EXISTS works_classification (
work_id TEXT,
classification_id SMALLINT,
PRIMARY KEY (work_id, classification_id),
FOREIGN KEY (work_id) REFERENCES works (id),
FOREIGN KEY (classification_id) REFERENCES classification (id)
);",
  # Lookup table of keywords.
  keywords = "CREATE TABLE IF NOT EXISTS keywords (
id int,
keyword text,
PRIMARY KEY (id)
);",
  # Link table: works <-> keywords.
  works_keywords = "CREATE TABLE IF NOT EXISTS works_keywords (
work_id text,
keyword_id int,
PRIMARY KEY (work_id, keyword_id),
FOREIGN KEY (work_id) REFERENCES works (id),
FOREIGN KEY (keyword_id) REFERENCES keywords (id)
);",
  # Citation pairs; both ends must exist in `works`.
  citations = "CREATE TABLE IF NOT EXISTS citations (
item_id_citing text,
item_id_cited text,
PRIMARY KEY (item_id_citing, item_id_cited),
FOREIGN KEY (item_id_cited) REFERENCES works (id),
FOREIGN KEY (item_id_citing) REFERENCES works (id)
);",
  # One row per cluster of a given set / network type / clustering method.
  clusters = "CREATE TABLE IF NOT EXISTS clusters (
id INT,
set TEXT,
network_method TEXT,
clustering_method TEXT,
component INT,
cluster INT,
PRIMARY KEY (id)
);",
  # Link table: works <-> clusters.
  works_clusters = "CREATE TABLE IF NOT EXISTS works_clusters (
work_id TEXT,
cluster_id INT,
PRIMARY KEY (work_id, cluster_id),
FOREIGN KEY (work_id) REFERENCES works (id),
FOREIGN KEY (cluster_id) REFERENCES clusters (id)
);"
)
for (ddl in schema_ddl) {
  dbExecute(db, ddl)
}
# -------------------------------------
# Import publication sets into `works`
# -------------------------------------

# Append the publications in `set`, joined with `abstracts` on item_id, to the
# `works` table.
#
# db        DBI connection holding the `works` table.
# set       data frame with item_id, pubyear, item_title, doi, source_title.
# abstracts data frame with item_id and abstract.
#
# Ids already present in `works` are skipped, so re-running is safe. Only the
# id column is fetched for that check; the original read the whole table,
# abstracts included. Rows are de-duplicated on id because `works.id` is the
# primary key and the left join could fan out if `abstracts` holds more than
# one row per item.
import_works <- function(db, set, abstracts) {
  existing_ids <- dbGetQuery(db, "SELECT id FROM works")$id
  new_works <- set %>%
    left_join(abstracts, by = "item_id") %>%
    select(id = item_id,
           pub_year = pubyear,
           title = item_title,
           abstract,
           doi,
           journal = source_title) %>%
    filter(!id %in% existing_ids) %>%
    distinct(id, .keep_all = TRUE)
  dbWriteTable(db, "works", new_works, row.names = FALSE, append = TRUE)
}

# Initial publications set
import_works(db, open_science_init_set, open_science_init_set_abstracts)

# Extended publications set (citations and references of the initial set)
import_works(db,
             joint_set_citations_and_references_from_intial_set,
             joint_set_abstracts)
# -------------------------------------
# Unique keywords
# -------------------------------------
# Prepare data
# `keyword` holds a comma-separated list wrapped in braces/quotes; split it
# into one row per (work, keyword) and strip the {, } and " characters.
# Both publication sets are stacked first so the cleaning is written once.
works_keywords <- bind_rows(
  open_science_init_set %>%
    select(work_id = item_id, keyword),
  joint_set_citations_and_references_from_intial_set %>%
    select(work_id = item_id, keyword)
) %>%
  separate_rows(keyword, sep = ",") %>%
  mutate(keyword = str_remove_all(keyword, "[{}\"]")) %>%
  distinct()

# Keyword lookup table. Ids are integers (1..n in order of first appearance)
# to match the `int` column type; `rownames_to_column()` produced character
# ids.
keywords <- works_keywords %>%
  select(keyword) %>%
  drop_na() %>%
  distinct() %>%
  mutate(id = row_number(), .before = 1)

# Replace keyword text by keyword id; works without keywords (NA) drop out.
works_keywords <- works_keywords %>%
  inner_join(keywords, by = "keyword") %>%
  select(work_id, keyword_id = id)

# Write to database
dbWriteTable(db, "keywords", keywords, row.names = FALSE, append = TRUE)
dbWriteTable(db, "works_keywords", works_keywords,
             row.names = FALSE, append = TRUE)

# Ids of all stored works, used below to filter citation pairs.
# (This statement was fused onto the previous line in the original.)
works <- dbGetQuery(db, "select id from works")
# -----------------------------------------------
# Import citation pairs into `citations`
# -----------------------------------------------

# Append the (item_id_citing, item_id_cited) pairs in `pairs` to `citations`.
#
# db       DBI connection holding the `citations` table.
# pairs    data frame with item_id_citing and item_id_cited; any other column
#          (e.g. item_id) is ignored.
# work_ids character vector of ids present in `works`.
#
# Only complete, distinct pairs whose two ends are both in `work_ids` and that
# are not yet stored are written; this mirrors the foreign keys and composite
# primary key declared on the table. The original applied these filters to
# three of the four inputs; they are now applied to all four.
append_citations <- function(db, pairs, work_ids) {
  new_pairs <- pairs %>%
    select(item_id_citing, item_id_cited) %>%
    drop_na() %>%
    distinct() %>%
    filter(item_id_citing %in% work_ids,
           item_id_cited %in% work_ids) %>%
    anti_join(dbReadTable(db, "citations"),
              by = c("item_id_citing", "item_id_cited"))
  dbWriteTable(db, "citations", new_pairs, row.names = FALSE, append = TRUE)
}

# Citations to the initial publication set
append_citations(db, open_science_init_set_citations_ids, works$id)
# References from the initial publication set
append_citations(db, open_science_init_set_refs_ids, works$id)
# Citations to the extended publication set
append_citations(db, joint_set_citations_ids, works$id)
# References from the extended publication set
append_citations(db, joint_set_refs_ids, works$id)

# Full citation table, input of the network construction that follows.
# (This statement was fused onto the previous line in the original.)
citations <- dbReadTable(db, "citations")
load("../data/open_science_init_set.RData")
core <- open_science_init_set %>%
  select(id = item_id)
# -----------------------------
# Produce network files (initial/core set)
# -----------------------------
# Edge lists are restricted to works of the initial set. Citations are
# filtered before the self-joins; this gives the same edges as filtering
# source/target afterwards but keeps the join small.
# `relationship = "many-to-many"` declares the intended fan-out of the
# self-joins, which dplyr otherwise warns about.
net_dir <- "../data/networks_initial_set"

# BC (bibliographic coupling): two citing works share a cited work;
# weight = number of shared references.
core_citing <- citations %>%
  filter(item_id_citing %in% core$id)
net_bc <- core_citing %>%
  inner_join(core_citing, by = "item_id_cited",
             relationship = "many-to-many") %>%
  select(source = item_id_citing.x, target = item_id_citing.y) %>%
  filter(source < target) %>% # keep each unordered pair once, no self-pairs
  count(source, target, name = "weight") %>%
  mutate(type = "undirected")
write_csv(net_bc, file.path(net_dir, "net_bc.csv"))

# DC (direct citation): one edge per citation between two core works.
net_dc <- citations %>%
  rename(source = item_id_citing, target = item_id_cited) %>%
  filter(source %in% core$id, target %in% core$id) %>%
  mutate(weight = 1,
         type = "directed") %>%
  distinct()
write_csv(net_dc, file.path(net_dir, "net_dc.csv"))

# CC (co-citation): two cited works share a citing work;
# weight = number of works citing both.
core_cited <- citations %>%
  filter(item_id_cited %in% core$id)
net_cc <- core_cited %>%
  inner_join(core_cited, by = "item_id_citing",
             relationship = "many-to-many") %>%
  select(source = item_id_cited.x, target = item_id_cited.y) %>%
  filter(source < target) %>%
  count(source, target, name = "weight") %>%
  mutate(type = "undirected")
write_csv(net_cc, file.path(net_dir, "net_cc.csv"))

# Combined networks: stack the edge lists and sum the weights per
# (source, target). The in-memory tables are used instead of re-reading the
# CSV files just written, which avoids column-type guessing on the ids.
# NOTE(review): DC edges are directed, so a DC edge (b, a) with b > a is not
# merged with a BC/CC edge (a, b) here - confirm this is intended.
combined_networks <- list(
  net_bc_cc_dc = list(net_bc, net_cc, net_dc),
  net_bc_cc = list(net_bc, net_cc),
  net_bc_dc = list(net_bc, net_dc),
  net_cc_dc = list(net_cc, net_dc)
)
for (net_name in names(combined_networks)) {
  bind_rows(combined_networks[[net_name]]) %>%
    group_by(source, target) %>%
    summarise(weight = sum(weight), .groups = "drop") %>%
    write_csv(file.path(net_dir, str_c(net_name, ".csv")))
}
## nodes files (with clusters) ----
# For every net_*.csv edge list of the core set: build an undirected graph,
# label connected components, run Louvain and Leiden clustering, and write
# one nodes_*.csv (id, component, cluster_louvain, cluster_leiden).
# NOTE(review): no RNG seed is set in the code shown, so cluster labels may
# differ between runs.
net_files <- tibble(file = list.files("../data/networks_initial_set/")) %>%
  filter(str_starts(file, "net_"))
for (file in net_files$file) {
  # Read ids as text so numeric-looking ids are not converted to numbers.
  edges <- read.csv(str_c("../data/networks_initial_set/", file),
                    colClasses = c(source = "character",
                                   target = "character"))
  network <- graph_from_data_frame(edges, directed = FALSE)
  V(network)$comp <- components(network)$membership
  # network <- induced_subgraph(network, V(network)$comp==1)
  # The edge attribute comes from the CSV column `weight`. The original asked
  # for `E(network)$Weight`, which does not exist and evaluates to NULL.
  V(network)$cluster_louvain <-
    cluster_louvain(network, weights = E(network)$weight)$membership
  V(network)$cluster_leiden <-
    cluster_leiden(network, weights = E(network)$weight)$membership
  # V(network)$degree <- degree(network)
  # V(network)$closeness <- closeness(network)
  # V(network)$eigen_centrality <- eigen_centrality(network)$vector
  # Namespace-qualified so this cannot resolve to another package's
  # as_data_frame() depending on attach order.
  igraph::as_data_frame(network, what = "vertices") %>%
    select(id = name,
           component = comp,
           cluster_louvain,
           cluster_leiden) %>%
    write_csv(str_c("../data/networks_initial_set/",
                    str_replace(file, "net", "nodes")))
}

# Louvain clusters of the BC network, read from the cluster workbook.
# (This assignment was fused onto the loop's closing brace in the original;
# the `set` column it created was dropped again by the select().)
clusters_bc <- read_xlsx("../data/networks_initial_set/clusters_louvain.xlsx",
                         sheet = "nodes_bc") %>%
  mutate(network_method = "bc",
         cluster_method = "louvain") %>%
  select(network_method,
         cluster_method,
         component,
         cluster)
# ---- Register Louvain clusters of the core-set networks ----
# One workbook sheet ("nodes_<method>") and one nodes_<method>.csv exist per
# network type; the seven copy-pasted reads are replaced by a loop over the
# network types, in the original stacking order.
network_methods <- c("bc", "cc", "dc", "bc_cc", "bc_dc", "cc_dc", "bc_cc_dc")
core_net_dir <- "../data/networks_initial_set/"

# New ids continue after the highest id already stored; COALESCE makes this
# work on an empty table, where MAX(id) is NULL.
id_offset <- as.numeric(
  dbGetQuery(db, "select coalesce(max(id), 0) as max_id from clusters")$max_id
)

clusters <- lapply(network_methods, function(method) {
  read_xlsx(str_c(core_net_dir, "clusters_louvain.xlsx"),
            sheet = str_c("nodes_", method)) %>%
    mutate(network_method = method,
           clustering_method = "louvain") %>%
    select(network_method, clustering_method, component, cluster)
}) %>%
  bind_rows() %>%
  distinct() %>%
  mutate(id = row_number() + id_offset, .before = 1) %>%
  mutate(set = "core")
dbWriteTable(db, "clusters", clusters, row.names = FALSE, append = TRUE)

# Map every work to its cluster id via (network_method, component, cluster).
# Work ids are read as text to match the TEXT column in `works`.
works_clusters <- lapply(network_methods, function(method) {
  read.csv(str_c(core_net_dir, "nodes_", method, ".csv"),
           colClasses = c(id = "character")) %>%
    mutate(network_method = method) %>%
    select(work_id = id, network_method, component,
           cluster = cluster_louvain)
}) %>%
  bind_rows() %>%
  distinct() %>%
  inner_join(clusters %>%
               select(id, network_method, component, cluster),
             by = c("network_method", "component", "cluster")) %>%
  select(work_id, cluster_id = id)
dbWriteTable(db, "works_clusters", works_clusters,
             row.names = FALSE, append = TRUE)

# Leiden clusters of the BC network, read from the Leiden workbook.
# (This assignment was fused onto the dbWriteTable() line in the original.)
clusters_bc <- read_xlsx("../data/networks_initial_set/clusters_leiden.xlsx",
                         sheet = "nodes_bc") %>%
  mutate(network_method = "bc",
         cluster_method = "leiden") %>%
  select(network_method,
         cluster_method,
         component,
         cluster)
# ---- Register Leiden clusters of the core-set networks ----
# One workbook sheet ("nodes_<method>") and one nodes_<method>.csv exist per
# network type; the copy-pasted reads are replaced by a loop over the network
# types, in the original stacking order.
network_methods <- c("bc", "cc", "dc", "bc_cc", "bc_dc", "cc_dc", "bc_cc_dc")
core_net_dir <- "../data/networks_initial_set/"

# New ids continue after the highest id already stored. COALESCE guards the
# empty-table case, where MAX(id) is NULL and every id would become NA.
id_offset <- as.numeric(
  dbGetQuery(db, "select coalesce(max(id), 0) as max_id from clusters")$max_id
)

clusters <- lapply(network_methods, function(method) {
  read_xlsx(str_c(core_net_dir, "clusters_leiden.xlsx"),
            sheet = str_c("nodes_", method)) %>%
    mutate(network_method = method,
           clustering_method = "leiden") %>%
    select(network_method, clustering_method, component, cluster)
}) %>%
  bind_rows() %>%
  distinct() %>%
  mutate(id = row_number() + id_offset, .before = 1) %>%
  mutate(set = "core")
dbWriteTable(db, "clusters", clusters, row.names = FALSE, append = TRUE)

# Map every work to its cluster id via (network_method, component, cluster).
# Work ids are read as text to match the TEXT column in `works`.
works_clusters <- lapply(network_methods, function(method) {
  read.csv(str_c(core_net_dir, "nodes_", method, ".csv"),
           colClasses = c(id = "character")) %>%
    mutate(network_method = method) %>%
    select(work_id = id, network_method, component,
           cluster = cluster_leiden)
}) %>%
  bind_rows() %>%
  distinct() %>%
  inner_join(clusters %>%
               select(id, network_method, component, cluster),
             by = c("network_method", "component", "cluster")) %>%
  select(work_id, cluster_id = id)
dbWriteTable(db, "works_clusters", works_clusters,
             row.names = FALSE, append = TRUE)
## List of nodes files ----
# Read every nodes_*.csv of the core set, using the Louvain membership as
# `cluster`. Ids are read as text so they join with the TEXT ids in the DB.
node_files <- tibble(file = list.files("../data/networks_initial_set/")) %>%
  filter(str_starts(file, "nodes_")) %>%
  mutate(file = str_c("../data/networks_initial_set/", file))
net <- lapply(node_files$file, function(path) {
  read_csv(path, col_types = cols(id = col_character())) %>%
    rename(cluster = cluster_louvain)
})
names(net) <- str_remove(str_remove(node_files$file,
                                    "../data/networks_initial_set/"),
                         "\\.csv")

## Lookup data shared by all networks ----
# The original re-loaded the .RData file and re-ran the three queries on
# every loop iteration; they do not depend on the network, so they are
# fetched once.
load("../data/open_science_init_set.RData")
core_ids <- open_science_init_set %>%
  select(id = item_id)
rm(open_science_init_set)
works_journals <- dbGetQuery(db, "SELECT DISTINCT id, journal FROM works")
keyword_lookup <- dbGetQuery(db,
"SELECT DISTINCT a.id, c.keyword
FROM works a
JOIN works_keywords b on b.work_id = a.id
JOIN keywords c on c.id = b.keyword_id") %>%
  mutate(keyword = lemmatize_strings(keyword))
cited_works <- dbGetQuery(db,
"
SELECT DISTINCT
a.id,
a.title,
a.pub_year,
a.journal,
count(distinct b.item_id_citing) as cited_by_count
FROM works a
JOIN citations b on b.item_id_cited = a.id
GROUP BY a.id, a.title, a.pub_year, a.journal")

# For a table with columns `cluster` and `term` (one row per paper-term
# pair), return one row per cluster with its `n_top` most specific terms
# pasted together with "; ".
# Specificity (si) = share of the term within the cluster divided by the
# share of the term in the whole table.
top_terms_by_cluster <- function(data, n_top = 10) {
  data %>%
    mutate(total = n()) %>%
    group_by(term) %>%
    mutate(n_papers = n(),
           pct_papers = n_papers / total) %>%
    group_by(cluster) %>%
    mutate(cluster_total = n()) %>%
    group_by(cluster, term) %>%
    mutate(n_cluster = n(),
           pct_cluster = n_cluster / cluster_total,
           si = pct_cluster / pct_papers) %>%
    ungroup() %>%
    select(term, cluster, n_papers, pct_papers,
           n_cluster, cluster_total, pct_cluster, si) %>%
    distinct() %>%
    arrange(desc(si)) %>%
    group_by(cluster) %>%
    filter(row_number() <= n_top) %>%
    summarise(term = paste(term, collapse = "; "), .groups = "drop")
}

## Cluster descriptions ----
clusters <- vector("list", length(net))
names(clusters) <- names(net)
for (i in seq_along(net)) {
  ### Component ----
  cluster_component <- net[[i]] %>%
    select(cluster, component) %>%
    distinct()
  ### Size ----
  cluster_size <- net[[i]] %>%
    count(cluster, name = "n")
  ### Number of core papers ----
  cluster_core_papers <- net[[i]] %>%
    inner_join(core_ids, by = "id") %>%
    count(cluster, name = "n_core")
  ### Journals ----
  clusters_journals <- net[[i]] %>%
    inner_join(works_journals, by = "id") %>%
    rename(term = journal) %>%
    top_terms_by_cluster() %>%
    rename(journal = term)
  ### Keywords ----
  clusters_keywords <- net[[i]] %>%
    inner_join(keyword_lookup, by = "id") %>%
    rename(term = keyword) %>%
    top_terms_by_cluster() %>%
    rename(keyword = term)
  ### Most cited papers ----
  cluster_citations <- net[[i]] %>%
    inner_join(cited_works, by = "id") %>%
    mutate(papers = str_c(title, ". (", pub_year, " )", ". ", journal)) %>%
    arrange(desc(cited_by_count)) %>%
    group_by(cluster) %>%
    filter(row_number() <= 10) %>%
    summarise(papers = paste(papers, collapse = "; "), .groups = "drop")
  ### Putting it all together ----
  # NOTE(review): the inner joins drop clusters that have no core paper, no
  # keyword or no cited paper - confirm this is intended.
  clusters[[i]] <- cluster_component %>%
    inner_join(cluster_size, by = "cluster") %>%
    inner_join(cluster_core_papers, by = "cluster") %>%
    inner_join(clusters_journals, by = "cluster") %>%
    inner_join(clusters_keywords, by = "cluster") %>%
    inner_join(cluster_citations, by = "cluster") %>%
    rename("Number of publications" = n,
           "Number of core publications" = n_core,
           "Top journals" = journal,
           "Top keywords" = keyword,
           "Top cited papers" = papers)
}
# One sheet per network, named after the nodes file.
writexl::write_xlsx(clusters, "../data/networks_initial_set/clusters_louvain.xlsx")
## List of nodes files ----
# Read every nodes_*.csv of the core set, using the Leiden membership as
# `cluster`. Ids are read as text so they join with the TEXT ids in the DB.
node_files <- tibble(file = list.files("../data/networks_initial_set/")) %>%
  filter(str_starts(file, "nodes_")) %>%
  mutate(file = str_c("../data/networks_initial_set/", file))
net <- lapply(node_files$file, function(path) {
  read_csv(path,
           col_types = cols(id = col_character()),
           show_col_types = FALSE) %>%
    rename(cluster = cluster_leiden)
})
names(net) <- str_remove(str_remove(node_files$file,
                                    "../data/networks_initial_set/"),
                         "\\.csv")

## Lookup data shared by all networks ----
# The original re-loaded the .RData file and re-ran the three queries on
# every loop iteration; they do not depend on the network, so they are
# fetched once.
load("../data/open_science_init_set.RData")
core_ids <- open_science_init_set %>%
  select(id = item_id)
rm(open_science_init_set)
works_journals <- dbGetQuery(db, "SELECT DISTINCT id, journal FROM works")
keyword_lookup <- dbGetQuery(db,
"SELECT DISTINCT a.id, c.keyword
FROM works a
JOIN works_keywords b on b.work_id = a.id
JOIN keywords c on c.id = b.keyword_id") %>%
  mutate(keyword = lemmatize_strings(keyword))
cited_works <- dbGetQuery(db,
"
SELECT DISTINCT
a.id,
a.title,
a.pub_year,
a.journal,
count(distinct b.item_id_citing) as cited_by_count
FROM works a
JOIN citations b on b.item_id_cited = a.id
GROUP BY a.id, a.title, a.pub_year, a.journal")

# For a table with columns `cluster` and `term` (one row per paper-term
# pair), return one row per cluster with its `n_top` most specific terms
# pasted together with "; ".
# Specificity (si) = share of the term within the cluster divided by the
# share of the term in the whole table.
top_terms_by_cluster <- function(data, n_top = 10) {
  data %>%
    mutate(total = n()) %>%
    group_by(term) %>%
    mutate(n_papers = n(),
           pct_papers = n_papers / total) %>%
    group_by(cluster) %>%
    mutate(cluster_total = n()) %>%
    group_by(cluster, term) %>%
    mutate(n_cluster = n(),
           pct_cluster = n_cluster / cluster_total,
           si = pct_cluster / pct_papers) %>%
    ungroup() %>%
    select(term, cluster, n_papers, pct_papers,
           n_cluster, cluster_total, pct_cluster, si) %>%
    distinct() %>%
    arrange(desc(si)) %>%
    group_by(cluster) %>%
    filter(row_number() <= n_top) %>%
    summarise(term = paste(term, collapse = "; "), .groups = "drop")
}

## Cluster descriptions ----
clusters <- vector("list", length(net))
names(clusters) <- names(net)
for (i in seq_along(net)) {
  ### Component ----
  cluster_component <- net[[i]] %>%
    select(cluster, component) %>%
    distinct()
  ### Size ----
  cluster_size <- net[[i]] %>%
    count(cluster, name = "n")
  ### Number of core papers ----
  cluster_core_papers <- net[[i]] %>%
    inner_join(core_ids, by = "id") %>%
    count(cluster, name = "n_core")
  ### Journals ----
  clusters_journals <- net[[i]] %>%
    inner_join(works_journals, by = "id") %>%
    rename(term = journal) %>%
    top_terms_by_cluster() %>%
    rename(journal = term)
  ### Keywords ----
  clusters_keywords <- net[[i]] %>%
    inner_join(keyword_lookup, by = "id") %>%
    rename(term = keyword) %>%
    top_terms_by_cluster() %>%
    rename(keyword = term)
  ### Most cited papers ----
  cluster_citations <- net[[i]] %>%
    inner_join(cited_works, by = "id") %>%
    mutate(papers = str_c(title, ". (", pub_year, " )", ". ", journal)) %>%
    arrange(desc(cited_by_count)) %>%
    group_by(cluster) %>%
    filter(row_number() <= 10) %>%
    summarise(papers = paste(papers, collapse = "; "), .groups = "drop")
  ### Putting it all together ----
  # NOTE(review): the inner joins drop clusters that have no core paper, no
  # keyword or no cited paper - confirm this is intended.
  clusters[[i]] <- cluster_component %>%
    inner_join(cluster_size, by = "cluster") %>%
    inner_join(cluster_core_papers, by = "cluster") %>%
    inner_join(clusters_journals, by = "cluster") %>%
    inner_join(clusters_keywords, by = "cluster") %>%
    inner_join(cluster_citations, by = "cluster") %>%
    rename("Number of publications" = n,
           "Number of core publications" = n_core,
           "Top journals" = journal,
           "Top keywords" = keyword,
           "Top cited papers" = papers)
}
# One sheet per network, named after the nodes file.
writexl::write_xlsx(clusters, "../data/networks_initial_set/clusters_leiden.xlsx")

# Full citation table, input of the expanded-set networks that follow.
# (This statement was fused onto the write_xlsx() line in the original.)
citations <- dbReadTable(db, "citations")
# -----------------------------
# Produce network files (expanded set)
# -----------------------------
# Same construction as for the core set, but over every stored citation.
# `relationship = "many-to-many"` declares the intended fan-out of the
# self-joins, which dplyr otherwise warns about.
net_dir <- "../data/networks"

# BC (bibliographic coupling): two citing works share a cited work;
# weight = number of shared references.
net_bc <- citations %>%
  inner_join(citations, by = "item_id_cited",
             relationship = "many-to-many") %>%
  select(source = item_id_citing.x, target = item_id_citing.y) %>%
  filter(source < target) %>% # keep each unordered pair once, no self-pairs
  count(source, target, name = "weight") %>%
  mutate(type = "undirected")
write_csv(net_bc, file.path(net_dir, "net_bc.csv"))

# DC (direct citation): one edge per stored citation.
net_dc <- citations %>%
  rename(source = item_id_citing, target = item_id_cited) %>%
  mutate(weight = 1,
         type = "directed") %>%
  distinct()
write_csv(net_dc, file.path(net_dir, "net_dc.csv"))

# CC (co-citation): two cited works share a citing work;
# weight = number of works citing both.
net_cc <- citations %>%
  inner_join(citations, by = "item_id_citing",
             relationship = "many-to-many") %>%
  select(source = item_id_cited.x, target = item_id_cited.y) %>%
  filter(source < target) %>%
  count(source, target, name = "weight") %>%
  mutate(type = "undirected")
write_csv(net_cc, file.path(net_dir, "net_cc.csv"))

# Combined networks: stack the edge lists and sum the weights per
# (source, target). The in-memory tables are used instead of re-reading the
# CSV files just written, which avoids column-type guessing on the ids.
# NOTE(review): DC edges are directed, so a DC edge (b, a) with b > a is not
# merged with a BC/CC edge (a, b) here - confirm this is intended.
combined_networks <- list(
  net_bc_cc_dc = list(net_bc, net_cc, net_dc),
  net_bc_cc = list(net_bc, net_cc),
  net_bc_dc = list(net_bc, net_dc),
  net_cc_dc = list(net_cc, net_dc)
)
for (net_name in names(combined_networks)) {
  bind_rows(combined_networks[[net_name]]) %>%
    group_by(source, target) %>%
    summarise(weight = sum(weight), .groups = "drop") %>%
    write_csv(file.path(net_dir, str_c(net_name, ".csv")))
}
## nodes files (with clusters) ----
# The node-file generation for the expanded set is disabled in the original
# and is kept disabled here.
# net_files <- tibble(file = list.files("../data/networks/")) %>%
# filter(str_starts(file, "net_"))
#
#
# for(file in net_files$file) {
# network <- graph_from_data_frame(read.csv(str_c("../data/networks/",file)), directed = F)
# V(network)$comp <- components(network)$membership
# #network <- induced_subgraph(network, V(network)$comp==1)
# V(network)$cluster_louvain <- cluster_louvain(network, weights = c(E(network)$Weight))$membership
# V(network)$cluster_leiden <- cluster_leiden(network, weights = c(E(network)$Weight))$membership
# # V(network)$degree <- degree(network)
# # V(network)$closeness <- closeness(network)
# # V(network)$eigen_centrality <- eigen_centrality(network)$vector
# select(as_data_frame(network, "both")$vertices,
# id = name,
# component = comp,
# cluster_louvain,
# cluster_leiden) %>%
# write_csv(str_c("../data/networks/",str_replace(file, "net", "nodes")))
# }

# Louvain clusters of the expanded BC network, read from the workbook.
# In the original this assignment started on the same line as the commented
# "# }" above, so `clusters_bc <- read_xlsx(` was swallowed by the comment and
# the following lines did not parse.
clusters_bc <- read_xlsx("../data/networks/clusters_louvain.xlsx",
                         sheet = "nodes_bc") %>%
  mutate(network_method = "bc",
         cluster_method = "louvain") %>%
  select(network_method,
         cluster_method,
         component,
         cluster)
# ---- Louvain clusters of the expanded-set networks ----
# One workbook sheet ("nodes_<method>") and one nodes_<method>.csv exist per
# network type; the copy-pasted reads are replaced by a loop over the network
# types, in the original stacking order.
network_methods <- c("bc", "cc", "dc", "bc_cc", "bc_dc", "cc_dc", "bc_cc_dc")
expanded_net_dir <- "../data/networks/"

# New ids continue after the highest id already stored. COALESCE guards the
# empty-table case, where MAX(id) is NULL and every id would become NA.
id_offset <- as.numeric(
  dbGetQuery(db, "select coalesce(max(id), 0) as max_id from clusters")$max_id
)

clusters <- lapply(network_methods, function(method) {
  read_xlsx(str_c(expanded_net_dir, "clusters_louvain.xlsx"),
            sheet = str_c("nodes_", method)) %>%
    mutate(network_method = method,
           clustering_method = "louvain") %>%
    select(network_method, clustering_method, component, cluster)
}) %>%
  bind_rows() %>%
  distinct() %>%
  mutate(id = row_number() + id_offset, .before = 1) %>%
  mutate(set = "expanded")
# Writing is disabled in the original and stays disabled.
# dbWriteTable(db, "clusters", clusters, row.names=F, append = T)

# Map every work to its cluster id via (network_method, component, cluster).
# Work ids are read as text to match the TEXT column in `works`.
works_clusters <- lapply(network_methods, function(method) {
  read.csv(str_c(expanded_net_dir, "nodes_", method, ".csv"),
           colClasses = c(id = "character")) %>%
    mutate(network_method = method) %>%
    select(work_id = id, network_method, component,
           cluster = cluster_louvain)
}) %>%
  bind_rows() %>%
  distinct() %>%
  inner_join(clusters %>%
               select(id, network_method, component, cluster),
             by = c("network_method", "component", "cluster")) %>%
  select(work_id, cluster_id = id)
# dbWriteTable(db,"works_clusters", works_clusters, row.names = F, append = T)

# Leiden clusters of the expanded BC network, read from the Leiden workbook.
# In the original this assignment started on the same line as the commented
# dbWriteTable() call above, so it was swallowed by the comment and the
# following lines did not parse.
clusters_bc <- read_xlsx("../data/networks/clusters_leiden.xlsx",
                         sheet = "nodes_bc") %>%
  mutate(network_method = "bc",
         cluster_method = "leiden") %>%
  select(network_method,
         cluster_method,
         component,
         cluster)
# ---- Leiden clusters of the expanded-set networks ----
# One workbook sheet ("nodes_<method>") exists per network type; the
# copy-pasted reads are replaced by a loop over the network types, in the
# original stacking order.
network_methods <- c("bc", "cc", "dc", "bc_cc", "bc_dc", "cc_dc", "bc_cc_dc")

# New ids continue after the highest id already stored. COALESCE guards the
# empty-table case, where MAX(id) is NULL and every id would become NA.
# NOTE(review): the Louvain write for the expanded set is commented out, so
# this offset equals the one used for the expanded Louvain clusters; the two
# id ranges would collide if both tables were written later.
id_offset <- as.numeric(
  dbGetQuery(db, "select coalesce(max(id), 0) as max_id from clusters")$max_id
)

clusters <- lapply(network_methods, function(method) {
  read_xlsx("../data/networks/clusters_leiden.xlsx",
            sheet = str_c("nodes_", method)) %>%
    mutate(network_method = method,
           clustering_method = "leiden") %>%
    select(network_method, clustering_method, component, cluster)
}) %>%
  bind_rows() %>%
  distinct() %>%
  mutate(id = row_number() + id_offset, .before = 1) %>%
  mutate(set = "expanded")
# Writing is disabled in the original and stays disabled.
# dbWriteTable(db, "clusters", clusters, row.names=F, append = T)
# Read the node list of one network and keep the columns that identify a
# work's Leiden cluster: work id, network method, component and cluster.
read_leiden_nodes <- function(method) {
  nodes <- read.csv(str_c("../data/networks/nodes_", method, ".csv"))
  nodes$network_method <- method
  select(nodes, work_id = id, network_method, component,
         cluster = cluster_leiden)
}

works_clusters_bc <- read_leiden_nodes("bc")
works_clusters_cc <- read_leiden_nodes("cc")
works_clusters_dc <- read_leiden_nodes("dc")
works_clusters_bc_cc <- read_leiden_nodes("bc_cc")
works_clusters_bc_dc <- read_leiden_nodes("bc_dc")
works_clusters_cc_dc <- read_leiden_nodes("cc_dc")
works_clusters_bc_cc_dc <- read_leiden_nodes("bc_cc_dc")

# Stack the per-network assignments, drop exact duplicates, then swap the
# (network_method, component, cluster) key for the id held in `clusters`.
works_clusters <- bind_rows(
  works_clusters_bc,
  works_clusters_cc,
  works_clusters_dc,
  works_clusters_bc_cc,
  works_clusters_bc_dc,
  works_clusters_cc_dc,
  works_clusters_bc_cc_dc
) %>%
  unique() %>%
  inner_join(
    select(clusters, id, network_method, component, cluster),
    by = c("network_method", "component", "cluster")
  ) %>%
  select(work_id, cluster_id = id)
# dbWriteTable(db,"works_clusters", works_clusters, row.names = F, append = T)
## List of nodes files ----
# Paths of every file in the networks folder whose name starts with "nodes_".
node_files <- tibble(file = list.files("../data/networks/")) %>%
  filter(str_starts(file, "nodes_")) %>%
  mutate(file = str_c("../data/networks/", file))

# One tibble per node file, with the Louvain assignment exposed as `cluster`.
# lapply() builds the list in one pass instead of growing it with c().
net <- lapply(node_files$file, function(path) {
  read_csv(path, show_col_types = FALSE) %>%
    rename(cluster = cluster_louvain)
})

# Name each element after its file, e.g. "nodes_bc". The folder prefix is
# removed as a fixed string so its dots are not treated as regex wildcards.
names(net) <- node_files$file %>%
  str_remove(fixed("../data/networks/")) %>%
  str_remove("\\.csv")
## Cluster descriptions ----
# Builds one description table per network in `net` (component, size, number
# of core papers, top journals, top keywords, most cited papers) and writes
# them all to one workbook, one sheet per network.

# Inputs that do not depend on the network are fetched once, before the loop.

# Ids of the core set. The .RData file is loaded into a private environment so
# a global `open_science_init_set` is neither overwritten nor removed.
core_env <- new.env()
load("../data/open_science_init_set.RData", envir = core_env)
desc_core_ids <- core_env$open_science_init_set %>%
  select(id = item_id)
rm(core_env)

desc_journals <- dbGetQuery(db, "SELECT DISTINCT id, journal FROM works")

# Keywords are lemmatized here, once, rather than after every join.
desc_keywords <- dbGetQuery(db,
"SELECT DISTINCT a.id, c.keyword
FROM works a
JOIN works_keywords b on b.work_id = a.id
JOIN keywords c on c.id = b.keyword_id") %>%
  mutate(keyword = lemmatize_strings(keyword))

desc_citations <- dbGetQuery(db,
"
SELECT DISTINCT
a.id,
a.title,
a.pub_year,
a.journal,
count(distinct b.item_id_citing) as cited_by_count
FROM works a
JOIN citations b on b.item_id_cited = a.id
GROUP BY a.id, a.title, a.pub_year, a.journal")

clusters <- vector("list", length(net))
names(clusters) <- names(net)
for (i in seq_along(net)) {
  net_nodes <- net[[i]]

  ### Component ----
  cluster_component <- net_nodes %>%
    select(cluster, component) %>%
    unique()

  ### Size ----
  cluster_size <- net_nodes %>%
    group_by(cluster) %>%
    summarize(n = n())

  ### Number of core papers ----
  cluster_core_papers <- net_nodes %>%
    inner_join(desc_core_ids, by = "id") %>%
    group_by(cluster) %>%
    summarize(n_core = n())

  ### Journals ----
  # si = share of the journal within the cluster divided by its share in the
  # whole network; the ten journals with the highest si are kept per cluster.
  clusters_journals <- net_nodes %>%
    inner_join(desc_journals, by = "id") %>%
    mutate(total = n()) %>%
    group_by(journal) %>%
    mutate(n_papers = n(),
           pct_papers = n_papers / total) %>%
    group_by(cluster) %>%
    mutate(cluster_total = n()) %>%
    group_by(cluster, journal) %>%
    mutate(n_cluster = n(),
           pct_cluster = n_cluster / cluster_total,
           si = pct_cluster / pct_papers) %>%
    ungroup() %>%
    select(journal, cluster, n_papers, pct_papers, n_cluster, cluster_total,
           pct_cluster, si) %>%
    unique() %>%
    group_by(cluster) %>%
    arrange(desc(si)) %>%
    mutate(rank = row_number()) %>%
    filter(rank <= 10) %>%
    select(cluster, journal) %>%
    mutate(journal = paste(journal, collapse = "; ")) %>%
    unique() %>%
    ungroup()

  ### keywords
  # Same ranking as for journals, applied to the lemmatized keywords.
  clusters_keywords <- net_nodes %>%
    inner_join(desc_keywords, by = "id") %>%
    mutate(total = n()) %>%
    group_by(keyword) %>%
    mutate(n_papers = n(),
           pct_papers = n_papers / total) %>%
    group_by(cluster) %>%
    mutate(cluster_total = n()) %>%
    group_by(cluster, keyword) %>%
    mutate(n_cluster = n(),
           pct_cluster = n_cluster / cluster_total,
           si = pct_cluster / pct_papers) %>%
    ungroup() %>%
    select(keyword, cluster, n_papers, pct_papers, n_cluster, cluster_total,
           pct_cluster, si) %>%
    unique() %>%
    group_by(cluster) %>%
    arrange(desc(si)) %>%
    mutate(rank = row_number()) %>%
    filter(rank <= 10) %>%
    select(cluster, keyword) %>%
    mutate(keyword = paste(keyword, collapse = "; ")) %>%
    unique() %>%
    ungroup()

  ### Most cited papers ----
  # Ten papers per cluster with the highest number of distinct citing items.
  cluster_citations <- net_nodes %>%
    inner_join(desc_citations, by = "id") %>%
    mutate(papers = str_c(title, ". (", pub_year, " )", ". ", journal)) %>%
    group_by(cluster) %>%
    arrange(desc(cited_by_count)) %>%
    mutate(rank = row_number()) %>%
    filter(rank <= 10) %>%
    select(cluster, papers) %>%
    mutate(papers = paste(papers, collapse = "; ")) %>%
    unique() %>%
    ungroup()

  ### putting it all together----
  # Inner joins: a cluster appears only if it has a row in every component
  # table (core papers, journals, keywords and cited papers).
  clusters[[i]] <- cluster_component %>%
    inner_join(cluster_size, by = "cluster") %>%
    inner_join(cluster_core_papers, by = "cluster") %>%
    inner_join(clusters_journals, by = "cluster") %>%
    inner_join(clusters_keywords, by = "cluster") %>%
    inner_join(cluster_citations, by = "cluster") %>%
    rename("Number of publications" = n,
           "Number of core publications" = n_core,
           "Top journals" = journal,
           "Top keywords" = keyword,
           "Top cited papers" = papers)
  # writexl::write_xlsx(str_c("data/networks/cluster_table_net_",names(net)[i],".xlsx"))
}
writexl::write_xlsx(clusters, "../data/networks/clusters_louvain.xlsx")
## List of nodes files ----
# Paths of every file in the networks folder whose name starts with "nodes_".
node_files <- tibble(file = list.files("../data/networks/")) %>%
  filter(str_starts(file, "nodes_")) %>%
  mutate(file = str_c("../data/networks/", file))

# One tibble per node file, with the Leiden assignment exposed as `cluster`.
# lapply() builds the list in one pass instead of growing it with c().
net <- lapply(node_files$file, function(path) {
  read_csv(path, show_col_types = FALSE) %>%
    rename(cluster = cluster_leiden)
})

# Name each element after its file, e.g. "nodes_bc". The folder prefix is
# removed as a fixed string so its dots are not treated as regex wildcards.
names(net) <- node_files$file %>%
  str_remove(fixed("../data/networks/")) %>%
  str_remove("\\.csv")
## Cluster descriptions ----
# Builds one description table per network in `net` (component, size, number
# of core papers, top journals, top keywords, most cited papers) and writes
# them all to one workbook, one sheet per network.

# Inputs that do not depend on the network are fetched once, before the loop.

# Ids of the core set. The .RData file is loaded into a private environment so
# a global `open_science_init_set` is neither overwritten nor removed.
core_env <- new.env()
load("../data/open_science_init_set.RData", envir = core_env)
desc_core_ids <- core_env$open_science_init_set %>%
  select(id = item_id)
rm(core_env)

desc_journals <- dbGetQuery(db, "SELECT DISTINCT id, journal FROM works")

# Keywords are lemmatized here, once, rather than after every join.
desc_keywords <- dbGetQuery(db,
"SELECT DISTINCT a.id, c.keyword
FROM works a
JOIN works_keywords b on b.work_id = a.id
JOIN keywords c on c.id = b.keyword_id") %>%
  mutate(keyword = lemmatize_strings(keyword))

desc_citations <- dbGetQuery(db,
"
SELECT DISTINCT
a.id,
a.title,
a.pub_year,
a.journal,
count(distinct b.item_id_citing) as cited_by_count
FROM works a
JOIN citations b on b.item_id_cited = a.id
GROUP BY a.id, a.title, a.pub_year, a.journal")

clusters <- vector("list", length(net))
names(clusters) <- names(net)
for (i in seq_along(net)) {
  net_nodes <- net[[i]]

  ### Component ----
  cluster_component <- net_nodes %>%
    select(cluster, component) %>%
    unique()

  ### Size ----
  cluster_size <- net_nodes %>%
    group_by(cluster) %>%
    summarize(n = n())

  ### Number of core papers
  cluster_core_papers <- net_nodes %>%
    inner_join(desc_core_ids, by = "id") %>%
    group_by(cluster) %>%
    summarize(n_core = n())

  ### Journals ----
  # si = share of the journal within the cluster divided by its share in the
  # whole network; the ten journals with the highest si are kept per cluster.
  clusters_journals <- net_nodes %>%
    inner_join(desc_journals, by = "id") %>%
    mutate(total = n()) %>%
    group_by(journal) %>%
    mutate(n_papers = n(),
           pct_papers = n_papers / total) %>%
    group_by(cluster) %>%
    mutate(cluster_total = n()) %>%
    group_by(cluster, journal) %>%
    mutate(n_cluster = n(),
           pct_cluster = n_cluster / cluster_total,
           si = pct_cluster / pct_papers) %>%
    ungroup() %>%
    select(journal, cluster, n_papers, pct_papers, n_cluster, cluster_total,
           pct_cluster, si) %>%
    unique() %>%
    group_by(cluster) %>%
    arrange(desc(si)) %>%
    mutate(rank = row_number()) %>%
    filter(rank <= 10) %>%
    select(cluster, journal) %>%
    mutate(journal = paste(journal, collapse = "; ")) %>%
    unique() %>%
    ungroup()

  ### keywords
  # Same ranking as for journals, applied to the lemmatized keywords.
  clusters_keywords <- net_nodes %>%
    inner_join(desc_keywords, by = "id") %>%
    mutate(total = n()) %>%
    group_by(keyword) %>%
    mutate(n_papers = n(),
           pct_papers = n_papers / total) %>%
    group_by(cluster) %>%
    mutate(cluster_total = n()) %>%
    group_by(cluster, keyword) %>%
    mutate(n_cluster = n(),
           pct_cluster = n_cluster / cluster_total,
           si = pct_cluster / pct_papers) %>%
    ungroup() %>%
    select(keyword, cluster, n_papers, pct_papers, n_cluster, cluster_total,
           pct_cluster, si) %>%
    unique() %>%
    group_by(cluster) %>%
    arrange(desc(si)) %>%
    mutate(rank = row_number()) %>%
    filter(rank <= 10) %>%
    select(cluster, keyword) %>%
    mutate(keyword = paste(keyword, collapse = "; ")) %>%
    unique() %>%
    ungroup()

  ### Most cited papers ----
  # Ten papers per cluster with the highest number of distinct citing items.
  cluster_citations <- net_nodes %>%
    inner_join(desc_citations, by = "id") %>%
    mutate(papers = str_c(title, ". (", pub_year, " )", ". ", journal)) %>%
    group_by(cluster) %>%
    arrange(desc(cited_by_count)) %>%
    mutate(rank = row_number()) %>%
    filter(rank <= 10) %>%
    select(cluster, papers) %>%
    mutate(papers = paste(papers, collapse = "; ")) %>%
    unique() %>%
    ungroup()

  ### putting it all together----
  # Inner joins: a cluster appears only if it has a row in every component
  # table (core papers, journals, keywords and cited papers).
  clusters[[i]] <- cluster_component %>%
    inner_join(cluster_size, by = "cluster") %>%
    inner_join(cluster_core_papers, by = "cluster") %>%
    inner_join(clusters_journals, by = "cluster") %>%
    inner_join(clusters_keywords, by = "cluster") %>%
    inner_join(cluster_citations, by = "cluster") %>%
    rename("Number of publications" = n,
           "Number of core publications" = n_core,
           "Top journals" = journal,
           "Top keywords" = keyword,
           "Top cited papers" = papers)
  # writexl::write_xlsx(str_c("data/networks/cluster_table_net_",names(net)[i],".xlsx"))
}
writexl::write_xlsx(clusters, "../data/networks/clusters_leiden.xlsx")
#Loading library
# New graph -----
# Work-to-cluster assignments as stored in the database; read once and reused
# for both the ranking and the plotting data below.
works_clusters <- dbReadTable(db, "works_clusters")

# Size rank of every Louvain cluster of the expanded set, ranked within its
# network method (rank 1 = largest; ties receive the average rank).
clusters_rk <- dbReadTable(db, "clusters") %>%
  filter(set == "expanded", clustering_method == "louvain") %>%
  inner_join(works_clusters, by = c("id" = "cluster_id")) %>%
  group_by(id, network_method) %>%
  # NOTE(review): `core` has to be a column of the stored clusters or
  # works_clusters table for this sum to work - confirm against the schema.
  mutate(size = n(),
         n_core = sum(core)) %>%
  select(cluster_id = id, size, n_core, network_method) %>%
  unique() %>%
  group_by(network_method) %>%
  mutate(rk = rank(desc(size))) %>%
  ungroup() %>%
  select(cluster_id, rk)

clusters <- dbReadTable(db, "clusters")
works <- dbGetQuery(db, "select distinct id from works")

# The cluster description loops remove `open_science_init_set` from the
# workspace after use, so reload it if it is no longer available.
if (!exists("open_science_init_set")) {
  load("../data/open_science_init_set.RData")
}

# One row per work/cluster pair with cluster attributes, the cluster's size
# rank and a flag telling whether the work belongs to the core set.
data <- works_clusters %>%
  as_tibble() %>%
  inner_join(clusters, by = c("cluster_id" = "id")) %>%
  inner_join(works, by = c("work_id" = "id")) %>%
  inner_join(clusters_rk, by = "cluster_id") %>%
  mutate(core = work_id %in% open_science_init_set$item_id)
# Number of rows per network method and core flag, with the per-method total
# alongside each count.
data %>%
  group_by(network_method, core) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  group_by(network_method) %>%
  mutate(total = sum(n)) %>%
  ungroup()

# Bar chart of rows per cluster rank, filled by core flag, one panel per
# network method.
ggplot(data, aes(rk, fill = core)) +
  geom_bar() +
  facet_wrap(facets = "network_method")

# Same chart restricted to the core works.
ggplot(filter(data, core == TRUE), aes(rk)) +
  geom_bar() +
  facet_wrap(facets = "network_method")
# Every work/cluster pair joined with the attributes of its cluster.
clusters_overall2 <- dbGetQuery(db, "select * from clusters c
join works_clusters wc on wc.cluster_id = c.id" )

# Histogram of cluster numbers, one panel per network method.
# NOTE(review): `cluster_number` is not among the cluster columns assembled in
# this script (id, network_method, clustering_method, component, cluster,
# set) - confirm it exists in the stored table.
ggplot(clusters_overall2) +
  aes(cluster_number) +
  geom_histogram() +
  facet_wrap(facets = "network_method")

# Number of distinct works per keyword, most frequent first.
keywords_freq <- dbGetQuery(db,
"SELECT DISTINCT c.keyword, COUNT(DISTINCT a.id) as n
FROM works a
JOIN works_keywords b on b.work_id = a.id
JOIN keywords c on c.id = b.keyword_id
GROUP BY c.keyword
ORDER BY n DESC")
writexl::write_xlsx(keywords_freq, "../data/keywords_freq.xlsx")
# The tables below do not work well in dark mode; switch to light mode to explore the data.
The available methods are BC, BC-CC, BC-CC-DC, BC-DC, CC, CC-DC and DC, each followed by a suffix naming the clustering method that was used (-Louvain or -Leiden). For example, BC-CC-Louvain denotes the clusters identified with the Louvain community detection algorithm in the bibliographic coupling + co-citation network.